(ql:quickload :ips)

To load "ips":
  Load 1 ASDF system:
    ips

; Loading "ips"
[package yason.system]

.

(:IPS)

(in-package :ips)

#<PACKAGE "IPS">

(load #P"IPS:DATA;coupons")

T

;;   TYPE    NAME             ITEM           REG-PRICE DISC-PRICE
;; 0 Italian Domo's           Pizza                 20         10
;; 1 Italian Mama Rita's      Pizza                 20         12
;; 2 BBQ     Smokey McSween's Barbecue              30         17
;; 3 BBQ     Smokey Grill     Ribs                  20         11
;; 4 Mexican Dos Amigos       Tacos                 16          8
;; 5 Mexican Holy Guacamole   Steak fajitas         13          8
;; 6 Seafood Sea Grill        Shrimp platter        20         11

(describe coupons)

COUPONS
  A data-frame with 7 observations of 5 variables

Variable   | Type        | Unit | Label                   
--------   | ----        | ---- | -----------             
TYPE       | CATEGORICAL | NIL  | Type of restaurant      
NAME       | CATEGORICAL | NIL  | Name of restaurant      
ITEM       | CATEGORICAL | NIL  | Discounted item         
REG-PRICE  | INTEGER     | USD  | Regular price of item   
DISC-PRICE | INTEGER     | USD  | Discounted price of item

(/ 800 1000)

4/5

(/ 1200 2000)

3/5

(defdf online (read-csv ips:eg01-07))

#<DATA-FRAME (4 observations of 2 variables)>

(print-data online)

NIL

;;   SOURCE    COUNT
;; 0 Google      406
;; 1 Library      75
;; 2 Wikipedia    52
;; 3 Other        19

(sum online:count)

552

(e* 100
    (e/ online:count 552.0))

#(73.55073 13.586956 9.42029 3.442029)

(load #P"IPS:DATA;online-data")

T

(variables)

(SOURCES)

(tabulate sources)

#<SPARSE-COUNTER tally: 552, varieties: 4
  google  406  (73.6%)
  library  75  (13.6%)
  wikipedia  52  (9.4%)
  other  19  (3.4%)>

;; Bar chart of the ONLINE data frame: one bar per SOURCE, with bar height
;; taken from the COUNT column.
;; NOTE(review): the y axis is titled "Percentage preference", but the encoded
;; field is the raw COUNT column rather than a percentage -- confirm the
;; intended label.
(vega:defplot online-bar-chart `(:mark :bar
                                 :data (:values ,online)
                                 :encoding (:x (:field source :type "nominal"      :title "Online resource")
                                            :y (:field count  :type "quantitative" :title "Percentage preference"))))

#<PLOT ONLINE-BAR-CHART: Bar chart
NIL>

(ips:plot online-bar-chart)

(ips:plot
 (vega:defplot online-bar-chart `(:mark :bar
                                  :width 300
                                  :data (:values ,online)
                                  :encoding (:x (:field source :type "nominal"      :title "Online resource")
                                             :y (:field count  :type "quantitative" :title "Percentage preference")))))

(ips:plot
 (vega:defplot online-bar-chart `(:mark :bar
                                  :width 300
                                  :data (:values ,online)
                                  :encoding (:x (:field source :type "nominal"      :title "Online resource" :sort "-y")
                                             :y (:field count  :type "quantitative" :title "Percentage preference")))))

;; Pie chart of the ONLINE data frame: an :arc mark whose slice angle (theta)
;; comes from COUNT and whose colour comes from SOURCE.
;; NOTE(review): this plot is defined under the name ONLINE-BAR-CHART, the same
;; name the bar charts earlier in this file use, even though it is a pie
;; chart; presumably it replaces that earlier definition.  Consider a distinct
;; name such as ONLINE-PIE-CHART.
(ips:plot
 (vega:defplot online-bar-chart `(:mark :arc
                                  :width 300
                                  :data (:values ,online)
                                  :encoding (:color (:field source :type "nominal" :title "Online resource")
                                             :theta (:field count  :type "quantitative")))))

(defdf scf (read-csv ips:eg01-11))

#<DATA-FRAME (46 observations of 3 variables)>

(summary scf)

(23 (50%) x "Control", 23 (50%) x "SCF",  46 reals, min=31, q25=43.166664,
 q50=48.4, q75=55, max=76)

(print-df scf)

(stem-and-leaf (select scf (range 23 nil) 'absorption)) ;arrays are 0 based, so we start at 23
                                                        ;nil means to the end of the vector

NIL

3 | 1 5
4 | 2 3 3 3 4 4 5 7 8 9
5 | 0 0 3 3 4 9
6 | 1 2
7 | 0 3 6

;; Left side: control group, right side: SCF group.
;; The control group occupies rows 0-22 (23 observations) and RANGE excludes
;; its end index, so the end must be 23; (range 0 22) silently dropped the
;; last control observation (absorption 66).
(back-to-back-stem-and-leaf (select scf (range  0 23)  'absorption)
                            (select scf (range 23 nil) 'absorption))

NIL

                 8 5 3 | 3 | 1 5
 9 9 8 7 7 7 6 3 2 2 1 | 4 | 2 3 3 3 4 4 5 7 8 9
             9 5 3 1 1 | 5 | 0 0 3 3 4 9
                 8 6 3 | 6 | 1 2
                     2 | 7 | 0 3 6

(defdf iq-scores (read-csv ips:eg01-14))

#<DATA-FRAME (60 observations of 1 variables)>

(plot/text:hist (75 155 10) (mapcar #'plot/text:hist-record (coerce iq-scores:iq 'list)))

NIL

< 75:       0
75:         2       ## 
85:         3       ### 
95:         10      ########## 
105:        16      ################ 
115:        13      ############# 
125:        10      ########## 
135:        5       ##### 
145:        1       # 
> 155:      0

(ips:plot
 (vega:defplot iq-plot `(:mark :bar
                         :data (:values ,iq-scores)
                         :width 300
                         :encoding (:x (:field iq :bin (:maxbin 8))
                                    :y (:aggregate "count")))))

(defdf call-times (read-csv ips:eg01-16))

#<DATA-FRAME (31492 observations of 1 variables)>

(ips:plot
 (vega:defplot call-plot `(:mark :bar
                           :data (:values ,(filter-rows call-times '(> 1200 length)))
                           :width 300
                           :encoding (:x (:field length :bin (:step 10) :title "Service time (seconds)")
                                      :y (:aggregate "count")))))

(mean (e2<= call-times:length 10))

0.07627333926076495d0

(defdf college-students (read-csv ips:eg01-19))

#<DATA-FRAME (50 observations of 4 variables)>

(ips:plot
 (vega:defplot student-plot `(:mark :bar
                              :data (:values ,college-students)
                              :width 300
                              :encoding (:x (:field undergrads :bin (:step 300000 :anchor 150000) :title "Undergraduates")
                                         :y (:aggregate "count")))))

(print-df
 (filter-rows college-students `(= undergrads ,(seq-max college-students:undergrads))))

(stem-and-leaf (map 'vector #'round college-students:ugradperthou)) ; round each per-thousand value to the nearest integer

NIL

3 | 8
4 | 1 1 1 1 1 3 3 5 5 5 5 6 6 6 7 7 7 7 7 8 8 8 8 9
5 | 0 0 1 1 1 2 2 2 4 4 4 4 5 6 6
6 | 0 0 0 0 1 7 9
7 | 1 2 7

(print-df
 (filter-rows college-students `(> ugradperthou 76)))

(defdf pth (read-csv ips:eg01-21))

#<DATA-FRAME (29 observations of 1 variables)>

(stem-and-leaf pth:pth)

NIL

 1 | 9
 2 | 5 8 8 8 9
 3 | 0 1 1 1 3 5 8 9
 4 | 0 5 6 8 9 9
 5 | 0 0 9 9
 6 | 3 4
 7 | 1 1
 8 |
 9 |
10 |
11 |
12 | 7

(load #P"IPS:DATA;vitamin-d")

T

(ips:plot
 (vega:defplot vd-by-mth `(:data (:values ,vitamin-d)
                           :width 300
                           :layer #((:mark :point
                                     :encoding (:x (:field :months)
                                                :y (:field :vitamind
                                                    :type  :quantitative)))
                                     (:mark (:type :line
                                             :color :red)
                                      :encoding (:x (:field :months)
                                                 :y (:field :vitamind
                                                     :aggregate :mean
                                                     :title "Vitamin D (nmol/l)")))))))

(defdf ttsb (read-csv ips::eg01-23))

#<DATA-FRAME (24 observations of 4 variables)>

(stem-and-leaf ttsb:time)

NIL

0 | 2 4 5 5 5 5 6 6 7 8
1 | 0 1 2 3 6 7 9 9
2 | 4 5
3 | 2 8
4 | 9
5 | 3

(mean ttsb:time)

391/24

(round-float (float cl:*) :precision 1) ; * is replaced by the last value, in this case 391/24.  Rounded, as in the text on page 29.

16.3

(median ttsb:time)

23/2

(round-float (float cl:*) :precision 1)

11.5

(fivenum ttsb:time) ; use :tukey t for the R style version of this function

#(2 5.5 11.5 21.5 53)

(load #P"IPS:DATA;sample-call-times")

T

(fivenum sample-call-times)

#(1 54.5 103.5 200.0 2631)

(ips:plot
 (vega:defplot iq-box-plot
   `(:data (:values ,iq-scores)
     :mark (:type :boxplot
            :extent "min-max")
     :encoding (:y (:field :iq
                    :type :quantitative
                    :scale (:zero :false)
                    :title "IQ")))))

(interquartile-range sample-call-times)

145.5

;; Outliers by the 1.5 x IQR rule: collect every call time lying below
;; Q1 - 1.5*IQR or above Q3 + 1.5*IQR, returned in ascending order.
(let-plus:let+ ((#(q1 q3) (quantiles sample-call-times '(0.25 0.75)))
                (fence (* 1.5 (interquartile-range sample-call-times)))
                (low   (- q1 fence))
                (high  (+ q3 fence)))
  (sort (loop for call across sample-call-times
              ;; keep only values outside the closed interval [low, high]
              unless (<= low call high)
                collect call)
        #'<))

(438 465 479 700 700 951 1148 2631)

(ips:plot
 (vega:defplot calls-box-plot
   `(:data (:values ,(plist-df `(:x ,sample-call-times)))
     :width 500
     :mark (:type :boxplot)
     :encoding (:x (:field x
                    :type :quantitative
                    :scale (:zero :false)
                    :title "Call Centre Times")))))

(defdf writers (read-csv ips:eg01-31))

#<DATA-FRAME (123 observations of 3 variables)>

(ips:plot
 (vega:defplot writer-deaths
   `(:data (:values ,writers)
     :width 150
     :mark (:type :boxplot)
     :encoding (:x (:field :type
                    :type :nominal
                    :title "Type of writer")
                :y (:field age
                    :type :quantitative
                    :scale (:zero :false)
                    :title "Age at death")
                :color (:field type
                        :type :nominal
                        :legend nil)))))

;; Define a variable X holding a few sample values, then compute its sample
;; standard deviation by hand: the square root of the summed squared
;; deviations from the mean divided by (n - 1).
(def x #(1792 1666 1362 1614 1460 1867 1439))
(let* ((deviations (e- x (mean x)))
       (degrees-of-freedom (1- (length x))))
  (sqrt (/ (reduce #'+ (esquare deviations))
           degrees-of-freedom)))

X

189.2397

(variance x)

107435/3

(float *)  ; call float to convert rational (exact) to a floating point approximation

35811.668

(sd x) ; square root of variance

189.23970689753952d0

(defdf grades (plist-df '(score #(1056 1080 900 1164 1020))))

#<DATA-FRAME (5 observations of 1 variables)>

(mean-sd-n grades:score)

1044

96.37427

5

(add-column! grades 'points (e/ grades:score 4) t) ; note the last parameter, 't', tells add-column to update the environment so we can refer to the column by name

#<DATA-FRAME (5 observations of 2 variables)>

(mean-sd-n grades:points)

261

24.093567

5

(defdf passenger-ages (read-csv ips:eg01-36))

#<DATA-FRAME (1309 observations of 14 variables)>

 ;; Two layers over the PASSENGER-AGES data frame, both binning AGE into at
 ;; most 8 bins and counting rows per bin: a bar layer and a red, naturally
 ;; interpolated line layer drawn over it.
 (ips:plot
  (vega:defplot age-plot
    `(:data (:values ,passenger-ages)
      :width 300
      :layer #((:mark :bar
                :encoding (:x (:field :age :bin (:maxbin 8))
                           :y (:aggregate "count")))
               (:mark (:type :line :stroke :red :interpolate :natural)
                :encoding (:x (:field :age :bin (:maxbin 8))
                           :y (:aggregate :count)))))))

 (ips:plot
  (vega:defplot ttsb-plot `(:data (:values ,ttsb)
                            :width 300
                            :layer #((:mark :bar
                                      :encoding (:x (:field :time :bin (:maxbin 8))
                                                 :y (:aggregate "count")))
                                     (:mark (:type :line :stroke :red :interpolate :natural)
                                      :encoding (:x (:field :time :bin (:maxbin 8))
                                                 :y (:aggregate :count)))))))

(defparameter rn 
    (r-normal 1010 (square 225))) ; First get a parameterised normal distribution object; the second argument is 225 squared

(cdf rn 800) ; cumulative distribution function of RN evaluated at 800

RN

0.17532394485222946d0

(- (cdf rn 800)
   (cdf rn 620))

0.13380572516345035d0

(defparameter sn 
    (r-normal 0 1))

SN

(cdf sn 1.47)

0.9292191268826562d0

(defparameter rn 
    (r-normal 500 (square 120))) ; Lisp stat functions use variance, not standard-deviation

RN

(quantile rn 0.9)

653.7861715630735d0

(ips:plot (vega:defplot qq (vega:qq-plot iq-scores:iq sn)))

(ips:plot (vega:defplot qq-ttsb (vega:qq-plot ttsb:time sn)))

ID	TREATMENT	ABSORPTION
1	Control	42
2	Control	33
3	Control	41
4	Control	49
5	Control	42
6	Control	47
7	Control	48
8	Control	47
9	Control	53
10	Control	72
11	Control	47
12	Control	63
13	Control	68
14	Control	59
15	Control	35
16	Control	46
17	Control	43
18	Control	55
19	Control	38
20	Control	49
21	Control	51
22	Control	51
23	Control	66
24	SCF	50
25	SCF	43
26	SCF	43
27	SCF	44
28	SCF	50
29	SCF	44
30	SCF	35
31	SCF	49
32	SCF	54
33	SCF	76
34	SCF	31
35	SCF	48
36	SCF	61
37	SCF	70
38	SCF	62
39	SCF	47
40	SCF	42
41	SCF	45
42	SCF	43
43	SCF	59
44	SCF	53
45	SCF	53
46	SCF	73

Introduction¶

Setup¶

Example 1.1 Restaurant discount coupons¶

Example 1.2 Categorical and quantitative variables for coupons¶

Example 1.6 Comparing colleges based on graduates¶

1.2 Displaying distributions with graphs¶

Categorical variables: Bar graphs and pie charts¶

Example 1.7 How do you do online research?¶

Example 1.8 Favorites as percents¶

Example 1.9 Bar graph for the online resource preference data¶

Example 1.10 Pie chart for the online resource preference data¶

Quantitative Variables: Stemplots and histograms¶

Example 1.11 Soluble corn fiber and calcium¶

Example 1.12 Back-to-back stemplot¶

Example 1.13 Stemplot with split stems¶

Example 1.14 - Distribution of IQ scores¶

Example 1.16 Histogram for customer service call lengths¶

Dealing with Outliers¶

Example 1.19 College students¶

Example 1.20 College students per 1000 population¶

Example 1.21 Healthy bones and PTH¶

Time plots¶

Example 1.22 Seasonal variation in vitamin D¶

1.3 Describing distributions with numbers¶

Measuring the center: The mean¶

Example 1.24 - Mean time to start a business¶

Measuring the center: The median¶

Example 1.25 - Median time to start a business¶

Measuring the spread: The quartiles¶

Example 1.26 - Finding quartiles¶

Example 1.27 - Call center call lengths¶

Example 1.28 - Boxplots¶

Interquartile range & outliers¶

Example 1.29 - IQR for call length¶

Example 1.30 Outliers for call length¶

Example 1.31 Do poets die young?¶

Measuring spread: standard deviation¶

Linear transformations¶

1.4 Density curves and normal distributions¶

Example 1.36 Example density curves¶

Example 1.40 - NCAA eligibility for competition¶

Example 1.41 - NCAA eligibility for aid and practice¶

Example 1.42 - Find the proportion from Z¶

Example 1.45 - How high for the top 10%¶

Example 1.46 - IQ scores are approximately normal¶

Example 1.47 - Times to start a business are skewed¶